### Load packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
library(magrittr) # For extra-piping operators (eg. %<>%)
# Trip log: download the CSV into a tibble, then print a compact overview of
# its columns and types.
trips <- read_csv("https://sds-aau.github.io/SDS-master/M1/data/trips.csv")
glimpse(trips)
Rows: 46,510
Columns: 11
$ X1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,…
$ username <chr> "@lewellenmichael", "@lewellenmichael", "@lewellenmichael", "@lewellenmichael", "@waylandchin…
$ country <chr> "Mexico", "Mexico", "Mexico", "Jordan", "China", "Vietnam", "Hong Kong", "China", "China", "C…
$ country_code <chr> "MX", "MX", "MX", "JO", "CN", "VN", "HK", "CN", "CN", "CN", "TH", "MY", "KH", "VN", "IN", "IN…
$ country_slug <chr> "mexico", "mexico", "mexico", "jordan", "china", "vietnam", "hong-kong", "china", "china", "c…
$ date_end <date> 2018-06-15, 2018-06-03, 2017-11-05, 2017-08-07, 2017-03-18, 2017-02-16, 2016-09-01, 2016-08-…
$ date_start <date> 2018-06-04, 2018-05-31, 2017-11-01, 2017-07-24, 2017-02-17, 2016-09-02, 2016-08-02, 2016-07-…
$ latitude <dbl> 21, 19, 21, 31, 40, 10, 22, 22, 22, 18, 7, 3, 11, 10, 13, 26, 27, 27, 28, 28, 19, 11, 22, 22,…
$ longitude <dbl> -101, -99, -86, 35, 122, 106, 114, 114, 113, 109, 98, 101, 104, 106, 80, 75, 78, 78, 77, 77, …
$ place <chr> "Guanajuato", "Mexico City", "Cancun", "Amman", "Yingkou", "Ho Chi Minh City", "Shenzhen", "H…
$ place_slug <chr> "mexico", "mexico-city-mexico", "cancun-mexico", "amman-jordan", "china", "ho-chi-minh-city-v…
# User profiles: download the CSV into a tibble, then print a compact overview
# of its columns and types.
people <- read_csv("https://sds-aau.github.io/SDS-master/M1/data/people.csv")
glimpse(people)
Rows: 4,016
Columns: 6
$ X1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25…
$ username <chr> "@lewellenmichael", "@waylandchin", "@karan", "@skaboss217", "@apwn", "@samcalma", "@paulbre…
$ followers <dbl> 1, 0, 2, 0, 17, 3, 4, 2, 17, 2, 11, 11, 5, 8, 0, 9, 3, 5, 25, 1, 1, 1, 61, 2, 11, 0, 1, 2, 9…
$ following <dbl> 2, 2, 1, 1, 426, 3, 9, 3, 23, 2, 17, 6, 9, 7, 1, 6, 3, 34, 23, 4, 4, 4, 120, 2, 10, 2, 2, 5,…
$ work_raw <chr> "Software Dev, Startup Founder, Finance, Crypto, Product Manager, Education, Data, Ecommerce…
$ education_raw <chr> "High School, Bachelor's Degree", NA, NA, NA, NA, NA, NA, "High School, Bachelor's Degree, M…
# Country lookup table (alpha_2 code, region, sub_region): download the CSV,
# then print a compact overview of its columns and types.
countries <- read_csv(
  "https://sds-aau.github.io/SDS-master/M1/data/countrylist.csv"
)
glimpse(countries)
Rows: 249
Columns: 3
$ alpha_2 <chr> "AF", "AX", "AL", "DZ", "AS", "AD", "AO", "AI", "AQ", "AG", "AR", "AM", "AW", "AU", "AT", "AZ",…
$ region <chr> "Asia", "Europe", "Europe", "Africa", "Oceania", "Europe", "Africa", "Americas", NA, "Americas"…
$ sub_region <chr> "Southern Asia", "Northern Europe", "Southern Europe", "Northern Africa", "Polynesia", "Souther…
# `countries` has already been read from this exact URL earlier in the script.
# Inspect the in-memory tibble instead of downloading the same file again.
glimpse(countries)
Rows: 249
Columns: 3
$ alpha_2 <chr> "AF", "AX", "AL", "DZ", "AS", "AD", "AO", "AI", "AQ", "AG", "AR", "AM", "AW", "AU", "AT", "AZ",…
$ region <chr> "Asia", "Europe", "Europe", "Africa", "Oceania", "Europe", "Africa", "Americas", NA, "Americas"…
$ sub_region <chr> "Southern Asia", "Northern Europe", "Southern Europe", "Northern Africa", "Polynesia", "Souther…
# City attributes: this file is tab-separated, so read it with an explicit
# delimiter, then print a compact overview of its columns and types.
cities <- read_delim(
  "https://sds-aau.github.io/SDS-master/M1/data/nomad_cities.csv",
  delim = "\t"
)
glimpse(cities)
Rows: 781
Columns: 27
$ X1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 2…
$ coffee_in_cafe <dbl> 1.73, 0.85, 1.99, 1.88, 5.00, 4.00, 5.38, 5.00, 5.00, 4.03, 3.50, 2.69, 4.30, 4.97,…
$ cost_beer <dbl> 1.73, 0.85, 1.99, 1.88, 5.00, 4.00, 5.38, 5.00, 5.00, 4.03, 3.50, 2.69, 4.30, 4.97,…
$ cost_coworking <dbl> 152.41, 98.88, 159.13, 47.01, 200.00, 250.00, 161.30, 300.00, 490.00, 96.78, 300.00…
$ cost_expat <dbl> 1273, 780, 1653, 1640, 3309, 4325, 2197, 2691, 3764, 1859, 2760, 1357, 2075, 2167, …
$ cost_nomad <dbl> 1364, 777, 1639, 1545, 3028, 3238, 2554, 3503, 3427, 2245, 2956, 1681, 2528, 2408, …
$ female_friendly <dbl> 1.00, 0.80, 1.00, 1.00, 0.80, 0.80, 0.80, 0.80, 0.80, 0.80, 0.80, 0.76, 1.00, 1.00,…
$ fragile_states_index <chr> "52.7", "78.8", "40.8", "DotMap(__next__=DotMap())", "34", "34", "39.8", "34", "34"…
$ free_wifi_available <dbl> 0.40, 0.60, 0.60, 1.00, 0.60, 1.00, 0.60, 0.40, 1.00, 0.24, 0.60, 0.62, 0.76, 1.00,…
$ freedom_score <chr> "0.6", "0.2", "0.8", "0.6", "0.6", "0.6", "0.8", "0.6", "0.6", "0.8", "0.6", "0.8",…
$ friendly_to_foreigners <dbl> 0.60, 0.60, 0.80, 0.80, 0.80, 0.80, 0.80, 1.00, 1.00, 0.80, 0.80, 0.83, 0.40, 1.00,…
$ internet_speed <dbl> 31, 14, 15, 16, 118, 81, 18, 23, 55, 24, 99, 21, 38, 11, 19, 17, 5, 20, 15, 55, 101…
$ latitude <dbl> 47.497912, 18.787747, 50.075538, 25.091075, 30.267153, 25.761680, 40.416775, 45.523…
$ leisure <dbl> 0.80, 0.62, 1.00, 1.00, 1.00, 1.00, 0.60, 1.00, 0.60, 0.78, 0.80, 0.63, 0.60, 0.60,…
$ lgbt_friendly <dbl> 0.27, 0.60, 0.60, 0.80, 0.60, 1.00, 1.00, 0.80, 0.80, 1.00, 1.00, 0.64, 0.60, 1.00,…
$ life_score <dbl> 0.86, 0.75, 0.83, 0.93, 0.95, 1.00, 0.88, 0.95, 0.92, 0.85, 0.87, 0.84, 0.87, 0.89,…
$ longitude <dbl> 19.040235, 98.993128, 14.437800, 121.559834, -97.743061, -80.191790, -3.703790, -12…
$ nightlife <dbl> 1.00, 0.40, 1.00, 0.60, 1.00, 1.00, 0.80, 1.00, 1.00, 0.80, 0.60, 0.80, 0.60, 0.60,…
$ nomadScore <dbl> 1.00, 0.95, 0.94, 0.94, 0.94, 0.92, 0.90, 0.90, 0.89, 0.88, 0.88, 0.88, 0.87, 0.87,…
$ nomad_score <dbl> 1.00, 0.95, 0.94, 0.94, 0.94, 0.92, 0.90, 0.90, 0.89, 0.88, 0.88, 0.88, 0.87, 0.87,…
$ peace_score <chr> "0.8", "0.4", "0.8", "DotMap(__next__=DotMap())", "0.8", "0.8", "0.8", "0.8", "0.8"…
$ place <chr> "Budapest", "Chiang Mai", "Prague", "Taipei", "Austin", "Miami", "Madrid", "Portlan…
$ places_to_work <dbl> 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8…
$ press_freedom_index <chr> "28.17", "44.53", "16.66", "24.37", "22.49", "22.49", "19.92", "22.49", "22.49", "1…
$ racism <dbl> 0.40, 0.40, 0.42, 0.00, 0.80, 0.80, 0.60, 0.80, 0.80, 1.00, 0.80, 1.00, 0.40, 1.00,…
$ safety <dbl> 0.60, 0.80, 0.80, 1.00, 0.73, 0.73, 0.80, 0.80, 0.60, 0.80, 0.40, 0.80, 0.60, 0.80,…
$ weed <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
# Four score columns were read as character. At least fragile_states_index and
# peace_score contain the placeholder text "DotMap(__next__=DotMap())" in some
# cells. Convert all four to numeric in place; as.numeric() turns cells it
# cannot parse into NA (and emits a coercion warning).
#
# Fix: the converted freedom_score used to be stored under the misspelled name
# `fredom_score`, which created an extra column and left `freedom_score` as
# text even though later code selects `freedom_score` by name.
cities <- cities %>%
  mutate(
    fragile_states_index = as.numeric(fragile_states_index),
    peace_score = as.numeric(peace_score),
    freedom_score = as.numeric(freedom_score),
    press_freedom_index = as.numeric(press_freedom_index)
  )
cities %>% glimpse()
Rows: 781
Columns: 28
$ X1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 2…
$ coffee_in_cafe <dbl> 1.73, 0.85, 1.99, 1.88, 5.00, 4.00, 5.38, 5.00, 5.00, 4.03, 3.50, 2.69, 4.30, 4.97,…
$ cost_beer <dbl> 1.73, 0.85, 1.99, 1.88, 5.00, 4.00, 5.38, 5.00, 5.00, 4.03, 3.50, 2.69, 4.30, 4.97,…
$ cost_coworking <dbl> 152.41, 98.88, 159.13, 47.01, 200.00, 250.00, 161.30, 300.00, 490.00, 96.78, 300.00…
$ cost_expat <dbl> 1273, 780, 1653, 1640, 3309, 4325, 2197, 2691, 3764, 1859, 2760, 1357, 2075, 2167, …
$ cost_nomad <dbl> 1364, 777, 1639, 1545, 3028, 3238, 2554, 3503, 3427, 2245, 2956, 1681, 2528, 2408, …
$ female_friendly <dbl> 1.00, 0.80, 1.00, 1.00, 0.80, 0.80, 0.80, 0.80, 0.80, 0.80, 0.80, 0.76, 1.00, 1.00,…
$ fragile_states_index <dbl> 52.7, 78.8, 40.8, NA, 34.0, 34.0, 39.8, 34.0, 34.0, 39.8, 34.0, 29.2, 29.0, 21.3, 2…
$ free_wifi_available <dbl> 0.40, 0.60, 0.60, 1.00, 0.60, 1.00, 0.60, 0.40, 1.00, 0.24, 0.60, 0.62, 0.76, 1.00,…
$ freedom_score <chr> "0.6", "0.2", "0.8", "0.6", "0.6", "0.6", "0.8", "0.6", "0.6", "0.8", "0.6", "0.8",…
$ friendly_to_foreigners <dbl> 0.60, 0.60, 0.80, 0.80, 0.80, 0.80, 0.80, 1.00, 1.00, 0.80, 0.80, 0.83, 0.40, 1.00,…
$ internet_speed <dbl> 31, 14, 15, 16, 118, 81, 18, 23, 55, 24, 99, 21, 38, 11, 19, 17, 5, 20, 15, 55, 101…
$ latitude <dbl> 47.497912, 18.787747, 50.075538, 25.091075, 30.267153, 25.761680, 40.416775, 45.523…
$ leisure <dbl> 0.80, 0.62, 1.00, 1.00, 1.00, 1.00, 0.60, 1.00, 0.60, 0.78, 0.80, 0.63, 0.60, 0.60,…
$ lgbt_friendly <dbl> 0.27, 0.60, 0.60, 0.80, 0.60, 1.00, 1.00, 0.80, 0.80, 1.00, 1.00, 0.64, 0.60, 1.00,…
$ life_score <dbl> 0.86, 0.75, 0.83, 0.93, 0.95, 1.00, 0.88, 0.95, 0.92, 0.85, 0.87, 0.84, 0.87, 0.89,…
$ longitude <dbl> 19.040235, 98.993128, 14.437800, 121.559834, -97.743061, -80.191790, -3.703790, -12…
$ nightlife <dbl> 1.00, 0.40, 1.00, 0.60, 1.00, 1.00, 0.80, 1.00, 1.00, 0.80, 0.60, 0.80, 0.60, 0.60,…
$ nomadScore <dbl> 1.00, 0.95, 0.94, 0.94, 0.94, 0.92, 0.90, 0.90, 0.89, 0.88, 0.88, 0.88, 0.87, 0.87,…
$ nomad_score <dbl> 1.00, 0.95, 0.94, 0.94, 0.94, 0.92, 0.90, 0.90, 0.89, 0.88, 0.88, 0.88, 0.87, 0.87,…
$ peace_score <dbl> 0.8, 0.4, 0.8, NA, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0, 0.8, 0.6,…
$ place <chr> "Budapest", "Chiang Mai", "Prague", "Taipei", "Austin", "Miami", "Madrid", "Portlan…
$ places_to_work <dbl> 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8…
$ press_freedom_index <dbl> 28.17, 44.53, 16.66, 24.37, 22.49, 22.49, 19.92, 22.49, 22.49, 19.92, 22.49, 17.27,…
$ racism <dbl> 0.40, 0.40, 0.42, 0.00, 0.80, 0.80, 0.60, 0.80, 0.80, 1.00, 0.80, 1.00, 0.40, 1.00,…
$ safety <dbl> 0.60, 0.80, 0.80, 1.00, 0.73, 0.73, 0.80, 0.80, 0.60, 0.80, 0.40, 0.80, 0.60, 0.80,…
$ weed <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
$ fredom_score <dbl> 0.6, 0.2, 0.8, 0.6, 0.6, 0.6, 0.8, 0.6, 0.6, 0.8, 0.6, 0.8, 0.8, 0.8, 0.8, 0.6, 0.2…
# Names of the `cities` columns used for the descriptive statistics
vars.desc <- c(
  "nomad_score",
  "cost_nomad",
  "places_to_work",
  "freedom_score",
  "friendly_to_foreigners",
  "life_score"
)
First, let's look at a classical correlation matrix.
# Correlation matrix of the descriptive variables, with rounded coefficient
# labels whose transparency follows the strength of the correlation.
#
# ggcorr() comes from the GGally package, which the visible part of this script
# never attaches, so it is called with an explicit namespace. Any selected
# column that is still stored as text is converted to numeric first; otherwise
# it would be left out of the correlation matrix.
cities %>%
  select(all_of(vars.desc)) %>%
  mutate(across(where(is.character), as.numeric)) %>%
  GGally::ggcorr(
    label = TRUE,
    label_size = 3,
    label_round = 2,
    label_alpha = TRUE
  )
library(FactoMineR)
library(factoextra)
# Remove the X1 column (a running 0, 1, 2, ... counter from the file), then
# keep only the rows that have no missing value in any remaining column.
cities <- drop_na(select(cities, -X1))
# Principal component analysis on every numeric column of `cities`.
# scale.unit = TRUE scales the variables to unit variance before the PCA;
# graph = TRUE additionally draws FactoMineR's default plots.
#
# Fix: the predicate used to be purrr's is_numeric(), which is deprecated and
# was removed in purrr 1.0.0, so the selection fails on current installs.
# where(is.numeric) selects the same columns here, because `cities` only holds
# double and character columns at this point.
res_pca <- cities %>%
  select(where(is.numeric)) %>%
  PCA(scale.unit = TRUE, graph = TRUE)
# Scree plot of the PCA: first 10 components, with each bar labelled
fviz_screeplot(
  res_pca,
  addlabels = TRUE,
  ncp = 10,
  ggtheme = theme_gray()
)
# Variable plot of the PCA: arrow transparency follows cos2 and arrow colour
# follows the variable's contribution, on a three-colour gradient. repel = TRUE
# keeps the text labels from overlapping.
fviz_pca_var(
  res_pca,
  alpha.var = "cos2",
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE,
  ggtheme = theme_gray()
)
# Biplot of the PCA: individuals are drawn as points only, with transparency
# following cos2 and colour following contribution on a three-colour gradient.
fviz_pca_biplot(
  res_pca,
  alpha.ind = "cos2",
  col.ind = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  geom = "point",
  ggtheme = theme_gray()
)
# Elbow plot for choosing the number of clusters: within-cluster sum of squares
# ("wss") of k-means on the standardised numeric columns of `cities`.
#
# Fix: the predicate used to be purrr's is_numeric(), which is deprecated and
# was removed in purrr 1.0.0; where(is.numeric) selects the same columns here.
cities %>%
  drop_na() %>%
  select(where(is.numeric)) %>%
  scale() %>%
  fviz_nbclust(kmeans, method = "wss")
# Hierarchical clustering of the numeric columns of `cities`, cut into k = 3
# clusters. stand = TRUE standardises the variables before clustering.
#
# Fix: the predicate used to be purrr's is_numeric(), which is deprecated and
# was removed in purrr 1.0.0; where(is.numeric) selects the same columns here.
hc <- cities %>%
  select(where(is.numeric)) %>%
  hcut(
    hc_func = "hclust",
    k = 3,
    stand = TRUE
  )
# Print the structure of the clustering result
glimpse(hc)
List of 12
$ merge : int [1:753, 1:2] -620 -673 -624 -560 -584 -619 -646 -72 -600 -688 ...
$ height : num [1:753] 0.208 0.282 0.322 0.376 0.44 ...
$ order : int [1:754] 49 63 109 320 198 139 178 10 4 5 ...
$ labels : NULL
$ method : chr "ward.D2"
$ call : language stats::hclust(d = x, method = hc_method)
$ dist.method: chr "euclidean"
$ cluster : int [1:754] 1 2 1 1 1 1 1 1 1 1 ...
$ nbclust : num 3
$ silinfo :List of 3
..$ widths :'data.frame': 754 obs. of 3 variables:
.. ..$ cluster : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
.. ..$ neighbor : num [1:754] 2 2 2 2 2 2 2 2 2 2 ...
.. ..$ sil_width: num [1:754] 0.378 0.372 0.365 0.364 0.362 ...
..$ clus.avg.widths: num [1:3] 0.2186 0.0856 0.1571
..$ avg.width : num 0.163
$ size : int [1:3] 350 239 165
$ data : num [1:754, 1:25] -0.814 -1.272 -0.679 0.889 0.368 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : NULL
.. ..$ : chr [1:25] "coffee_in_cafe" "cost_beer" "cost_coworking" "cost_expat" ...
..- attr(*, "scaled:center")= Named num [1:25] 3.29 3.29 209.08 1874.17 2304.65 ...
.. ..- attr(*, "names")= chr [1:25] "coffee_in_cafe" "cost_beer" "cost_coworking" "cost_expat" ...
..- attr(*, "scaled:scale")= Named num [1:25] 1.92 1.92 173.91 1256.66 1081.81 ...
.. ..- attr(*, "names")= chr [1:25] "coffee_in_cafe" "cost_beer" "cost_coworking" "cost_expat" ...
- attr(*, "class")= chr [1:2] "hclust" "hcut"
# Cluster plot of the hierarchical clustering result, using the numeric columns
# of `cities` as the underlying data.
#
# Fix: the predicate used to be purrr's is_numeric(), which is deprecated and
# was removed in purrr 1.0.0; where(is.numeric) selects the same columns here.
hc %>%
  fviz_cluster(
    data = cities %>% select(where(is.numeric)),
    ggtheme = theme_gray()
  )
# Print the cluster membership vector (one integer label per clustered row)
hc[["cluster"]]
[1] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 2 1 1 2 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[57] 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 2 2 1 1 1 1
[113] 1 2 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 2 1 1 1 1 1 1 1 2 1 1 2 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 2 1
[169] 2 2 1 1 2 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 2 2 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 2 2
[225] 2 1 1 1 2 1 1 2 1 1 2 2 1 1 1 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1
[281] 1 1 1 1 1 2 1 1 1 3 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 3 1 2 1 2 2 1 1 2 1 1 2 1 2 1 2 1 1 3 1 1 1 1
[337] 1 2 3 1 2 2 1 1 2 1 1 2 1 1 1 2 1 2 1 1 1 2 2 2 1 2 2 2 2 1 1 2 1 2 2 2 1 2 2 1 2 1 1 2 2 1 1 2 1 2 3 1 2 2 1 2
[393] 3 3 1 1 1 3 3 2 1 1 1 2 1 2 2 1 1 2 1 2 2 1 2 3 1 1 1 1 3 2 1 2 1 1 1 2 2 2 2 1 1 3 1 1 1 3 2 2 2 2 1 3 2 3 2 1
[449] 2 2 2 1 2 1 1 2 2 2 1 1 2 1 3 1 2 2 1 3 2 2 2 1 2 3 2 3 1 1 3 2 3 3 2 3 1 2 2 1 2 2 3 1 1 3 2 1 2 1 2 2 3 1 1 2
[505] 2 2 2 1 2 2 1 3 2 3 1 2 2 2 2 3 3 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 3 3 3 1 2 2 3 2 3 1 3 2 2 2 3 2 3
[561] 2 2 3 2 2 2 2 2 3 2 2 3 2 3 3 2 1 3 2 3 3 1 2 3 2 2 3 3 2 2 3 2 1 3 2 2 2 2 1 3 2 2 2 2 2 3 2 3 3 3 2 2 1 2 3 2
[617] 3 3 3 3 3 3 3 3 2 3 3 2 2 3 1 3 2 3 3 3 3 2 3 3 3 3 3 3 2 3 3 3 3 3 3 2 3 2 3 3 3 3 2 2 2 3 3 2 3 3 3 3 3 3 2 1
[673] 3 3 2 3 3 3 2 3 3 3 3 2 2 3 3 3 2 3 2 2 3 3 2 2 2 3 3 1 3 3 3 3 3 3 3 2 3 3 3 3 2 3 2 2 3 3 3 3 3 3 3 3 3 3 3 3
[729] 3 2 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
# Print the structure of the PCA result
glimpse(res_pca)
List of 5
$ eig : num [1:25, 1:3] 10.28 2.25 1.52 1.29 1.06 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:25] "comp 1" "comp 2" "comp 3" "comp 4" ...
.. ..$ : chr [1:3] "eigenvalue" "percentage of variance" "cumulative percentage of variance"
$ var :List of 4
..$ coord : num [1:25, 1:5] 0.727 0.727 0.391 0.561 0.623 ...
.. ..- attr(*, "dimnames")=List of 2
..$ cor : num [1:25, 1:5] 0.727 0.727 0.391 0.561 0.623 ...
.. ..- attr(*, "dimnames")=List of 2
..$ cos2 : num [1:25, 1:5] 0.528 0.528 0.153 0.315 0.388 ...
.. ..- attr(*, "dimnames")=List of 2
..$ contrib: num [1:25, 1:5] 5.14 5.14 1.49 3.07 3.78 ...
.. ..- attr(*, "dimnames")=List of 2
$ ind :List of 4
..$ coord : num [1:754, 1:5] 1.973 -0.898 3.461 5.178 5.477 ...
.. ..- attr(*, "dimnames")=List of 2
..$ cos2 : num [1:754, 1:5] 0.1245 0.0321 0.3376 0.43 0.5844 ...
.. ..- attr(*, "dimnames")=List of 2
..$ contrib: num [1:754, 1:5] 0.0502 0.0104 0.1546 0.346 0.3871 ...
.. ..- attr(*, "dimnames")=List of 2
..$ dist : Named num [1:754] 5.59 5.01 5.96 7.9 7.16 ...
.. ..- attr(*, "names")= chr [1:754] "1" "2" "3" "4" ...
$ svd :List of 3
..$ vs: num [1:25] 3.21 1.5 1.23 1.13 1.03 ...
..$ U : num [1:754, 1:5] 0.615 -0.28 1.08 1.615 1.708 ...
..$ V : num [1:25, 1:5] 0.227 0.227 0.122 0.175 0.194 ...
$ call:List of 9
..$ row.w : num [1:754] 0.00133 0.00133 0.00133 0.00133 0.00133 ...
..$ col.w : num [1:25] 1 1 1 1 1 1 1 1 1 1 ...
..$ scale.unit: logi TRUE
..$ ncp : num 5
..$ centre : num [1:25] 3.29 3.29 209.08 1874.17 2304.65 ...
..$ ecart.type: num [1:25] 1.92 1.92 173.8 1255.83 1081.09 ...
..$ X :'data.frame': 754 obs. of 25 variables:
.. ..$ coffee_in_cafe : num [1:754] 1.73 0.85 1.99 5 4 5.38 5 5 4.03 3.5 ...
.. ..$ cost_beer : num [1:754] 1.73 0.85 1.99 5 4 5.38 5 5 4.03 3.5 ...
.. ..$ cost_coworking : num [1:754] 152.4 98.9 159.1 200 250 ...
.. ..$ cost_expat : num [1:754] 1273 780 1653 3309 4325 ...
.. ..$ cost_nomad : num [1:754] 1364 777 1639 3028 3238 ...
.. ..$ female_friendly : num [1:754] 1 0.8 1 0.8 0.8 0.8 0.8 0.8 0.8 0.8 ...
.. ..$ fragile_states_index : num [1:754] 52.7 78.8 40.8 34 34 39.8 34 34 39.8 34 ...
.. ..$ free_wifi_available : num [1:754] 0.4 0.6 0.6 0.6 1 0.6 0.4 1 0.24 0.6 ...
.. ..$ friendly_to_foreigners: num [1:754] 0.6 0.6 0.8 0.8 0.8 0.8 1 1 0.8 0.8 ...
.. ..$ internet_speed : num [1:754] 31 14 15 118 81 18 23 55 24 99 ...
.. ..$ latitude : num [1:754] 47.5 18.8 50.1 30.3 25.8 ...
.. ..$ leisure : num [1:754] 0.8 0.62 1 1 1 0.6 1 0.6 0.78 0.8 ...
.. ..$ lgbt_friendly : num [1:754] 0.27 0.6 0.6 0.6 1 1 0.8 0.8 1 1 ...
.. ..$ life_score : num [1:754] 0.86 0.75 0.83 0.95 1 0.88 0.95 0.92 0.85 0.87 ...
.. ..$ longitude : num [1:754] 19 99 14.4 -97.7 -80.2 ...
.. ..$ nightlife : num [1:754] 1 0.4 1 1 1 0.8 1 1 0.8 0.6 ...
.. ..$ nomadScore : num [1:754] 1 0.95 0.94 0.94 0.92 0.9 0.9 0.89 0.88 0.88 ...
.. ..$ nomad_score : num [1:754] 1 0.95 0.94 0.94 0.92 0.9 0.9 0.89 0.88 0.88 ...
.. ..$ peace_score : num [1:754] 0.8 0.4 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 ...
.. ..$ places_to_work : num [1:754] 1 0.8 1 1 1 1 1 1 0.8 1 ...
.. ..$ press_freedom_index : num [1:754] 28.2 44.5 16.7 22.5 22.5 ...
.. ..$ racism : num [1:754] 0.4 0.4 0.42 0.8 0.8 0.6 0.8 0.8 1 0.8 ...
.. ..$ safety : num [1:754] 0.6 0.8 0.8 0.73 0.73 0.8 0.8 0.6 0.8 0.4 ...
.. ..$ weed : num [1:754] 0 0 1 0 0 1 1 1 1 0 ...
.. ..$ fredom_score : num [1:754] 0.6 0.2 0.8 0.6 0.6 0.8 0.6 0.6 0.8 0.6 ...
..$ row.w.init: num [1:754] 1 1 1 1 1 1 1 1 1 1 ...
..$ call : language PCA(X = ., scale.unit = TRUE, graph = TRUE)
- attr(*, "class")= chr [1:2] "PCA" "list "
# Attach the cluster label and the scores on the first two principal
# components to each city. Both `hc` and `res_pca` were fitted on the rows of
# `cities` in their current order, so the vectors line up row by row.
#
# Fix: later steps read cities$cluster (cross-tabulations, per-cluster means,
# map colours), but the cluster labels were only ever printed and never stored
# in `cities`. Store them here, and fail early if the lengths do not match.
stopifnot(
  length(hc$cluster) == nrow(cities),
  nrow(res_pca$ind$coord) == nrow(cities)
)
cities[, "cluster"] <- hc$cluster
cities[, "pca1"] <- res_pca$ind$coord[, 1]
cities[, "pca2"] <- res_pca$ind$coord[, 2]
# Look up a country code for each place from the trip log, add the matching
# region / sub_region from `countries`, and attach the result to `cities`.
#
# Fix: in `trips` the same place name can be recorded under more than one
# country_code (including a missing code). Joining every (place, country_code)
# pair onto `cities` therefore duplicated city rows and inflated every later
# per-cluster count. Keep only the most frequently recorded country_code per
# place, so the join adds columns without adding rows.
#
# NOTE(review): dplyr joins match NA to NA by default, so a place whose only
# recorded code is missing still receives the region of the `countries` row
# whose alpha_2 is missing (presumably Namibia's "NA" code parsed as a missing
# value -- confirm against the source data).
x <- trips %>%
  count(place, country_code) %>%
  arrange(place, desc(n)) %>%
  distinct(place, .keep_all = TRUE) %>%
  left_join(countries, by = c("country_code" = "alpha_2"))

cities <- cities %>%
  left_join(x, by = "place")
# Print a compact overview of `cities` after the join
glimpse(cities)
Rows: 857
Columns: 34
$ coffee_in_cafe <dbl> 1.73, 0.85, 0.85, 1.99, 1.99, 5.00, 5.00, 4.00, 5.38, 5.00, 5.00, 5.00, 5.00, 4.03,…
$ cost_beer <dbl> 1.73, 0.85, 0.85, 1.99, 1.99, 5.00, 5.00, 4.00, 5.38, 5.00, 5.00, 5.00, 5.00, 4.03,…
$ cost_coworking <dbl> 152.41, 98.88, 98.88, 159.13, 159.13, 200.00, 200.00, 250.00, 161.30, 300.00, 300.0…
$ cost_expat <dbl> 1273, 780, 780, 1653, 1653, 3309, 3309, 4325, 2197, 2691, 2691, 2691, 3764, 1859, 2…
$ cost_nomad <dbl> 1364, 777, 777, 1639, 1639, 3028, 3028, 3238, 2554, 3503, 3503, 3503, 3427, 2245, 2…
$ female_friendly <dbl> 1.00, 0.80, 0.80, 1.00, 1.00, 0.80, 0.80, 0.80, 0.80, 0.80, 0.80, 0.80, 0.80, 0.80,…
$ fragile_states_index <dbl> 52.7, 78.8, 78.8, 40.8, 40.8, 34.0, 34.0, 34.0, 39.8, 34.0, 34.0, 34.0, 34.0, 39.8,…
$ free_wifi_available <dbl> 0.40, 0.60, 0.60, 0.60, 0.60, 0.60, 0.60, 1.00, 0.60, 0.40, 0.40, 0.40, 1.00, 0.24,…
$ freedom_score <chr> "0.6", "0.2", "0.2", "0.8", "0.8", "0.6", "0.6", "0.6", "0.8", "0.6", "0.6", "0.6",…
$ friendly_to_foreigners <dbl> 0.60, 0.60, 0.60, 0.80, 0.80, 0.80, 0.80, 0.80, 0.80, 1.00, 1.00, 1.00, 1.00, 0.80,…
$ internet_speed <dbl> 31, 14, 14, 15, 15, 118, 118, 81, 18, 23, 23, 23, 55, 24, 99, 21, 38, 11, 19, 17, 5…
$ latitude <dbl> 47.497912, 18.787747, 18.787747, 50.075538, 50.075538, 30.267153, 30.267153, 25.761…
$ leisure <dbl> 0.80, 0.62, 0.62, 1.00, 1.00, 1.00, 1.00, 1.00, 0.60, 1.00, 1.00, 1.00, 0.60, 0.78,…
$ lgbt_friendly <dbl> 0.27, 0.60, 0.60, 0.60, 0.60, 0.60, 0.60, 1.00, 1.00, 0.80, 0.80, 0.80, 0.80, 1.00,…
$ life_score <dbl> 0.86, 0.75, 0.75, 0.83, 0.83, 0.95, 0.95, 1.00, 0.88, 0.95, 0.95, 0.95, 0.92, 0.85,…
$ longitude <dbl> 19.040235, 98.993128, 98.993128, 14.437800, 14.437800, -97.743061, -97.743061, -80.…
$ nightlife <dbl> 1.00, 0.40, 0.40, 1.00, 1.00, 1.00, 1.00, 1.00, 0.80, 1.00, 1.00, 1.00, 1.00, 0.80,…
$ nomadScore <dbl> 1.00, 0.95, 0.95, 0.94, 0.94, 0.94, 0.94, 0.92, 0.90, 0.90, 0.90, 0.90, 0.89, 0.88,…
$ nomad_score <dbl> 1.00, 0.95, 0.95, 0.94, 0.94, 0.94, 0.94, 0.92, 0.90, 0.90, 0.90, 0.90, 0.89, 0.88,…
$ peace_score <dbl> 0.8, 0.4, 0.4, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 1.0, 1.0…
$ place <chr> "Budapest", "Chiang Mai", "Chiang Mai", "Prague", "Prague", "Austin", "Austin", "Mi…
$ places_to_work <dbl> 1.0, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 1.0, 1.0, 1.0…
$ press_freedom_index <dbl> 28.17, 44.53, 44.53, 16.66, 16.66, 22.49, 22.49, 22.49, 19.92, 22.49, 22.49, 22.49,…
$ racism <dbl> 0.40, 0.40, 0.40, 0.42, 0.42, 0.80, 0.80, 0.80, 0.60, 0.80, 0.80, 0.80, 0.80, 1.00,…
$ safety <dbl> 0.60, 0.80, 0.80, 0.80, 0.80, 0.73, 0.73, 0.73, 0.80, 0.80, 0.80, 0.80, 0.60, 0.80,…
$ weed <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,…
$ fredom_score <dbl> 0.6, 0.2, 0.2, 0.8, 0.8, 0.6, 0.6, 0.6, 0.8, 0.6, 0.6, 0.6, 0.6, 0.8, 0.6, 0.8, 0.8…
$ cluster <int> 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2,…
$ pca1 <dbl> 1.9727643, -0.8977418, -0.8977418, 3.4605600, 3.4605600, 5.1777721, 5.1777721, 5.47…
$ pca2 <dbl> 3.324909740, 3.510553919, 3.510553919, 3.324859942, 3.324859942, 1.446367545, 1.446…
$ country_code <chr> "HU", "TH", NA, "CZ", NA, "US", NA, "US", "ES", "UK", "US", NA, "US", "ES", "US", "…
$ n <int> 459, 913, 1, 280, 60, 246, 1, 175, 296, 1, 180, 2, 213, 93, 112, 176, 204, 69, 256,…
$ region <chr> "Europe", "Asia", "Africa", "Europe", "Africa", "Americas", "Africa", "Americas", "…
$ sub_region <chr> "Eastern Europe", "South-eastern Asia", "Sub-Saharan Africa", "Eastern Europe", "Su…
# Cross-tabulate cluster membership against world region
table(cities[["cluster"]], cities[["region"]])
Africa Americas Asia Europe Oceania
1 34 144 30 154 20
2 26 60 73 62 0
3 15 8 62 4 0
# Cross-tabulate cluster membership against sub-region
table(cities[["cluster"]], cities[["sub_region"]])
Australia and New Zealand Central Asia Eastern Asia Eastern Europe Latin America and the Caribbean
1 20 0 18 6 11
2 0 1 3 37 56
3 0 2 29 3 8
Northern Africa Northern America Northern Europe South-eastern Asia Southern Asia Southern Europe
1 1 133 44 3 0 46
2 6 4 3 37 21 22
3 4 0 0 7 21 1
Sub-Saharan Africa Western Asia Western Europe
1 33 9 58
2 20 11 0
3 11 3 0
# Mean score of each cluster on the first two principal components
summarise(
  group_by(cities, cluster),
  pca1 = mean(pca1),
  pca2 = mean(pca2)
)
# Number of recorded trips per place, most visited first
count(trips, place, sort = TRUE, name = "n_city")
# Add the per-place trip count to `cities` as column n_city
cities <- left_join(
  cities,
  count(trips, place, sort = TRUE, name = "n_city"),
  by = "place"
)
To finish up, let's plot it on a map, in the simplest way possible.
library(ggmap)
# World map layer with gray outlines and gray fill
mapWorld <- borders("world", colour = "gray50", fill = "gray50")
# Base plot holding only the world map; later layers are added on top of it
mp <- ggplot() + mapWorld
# Draw one point per row of `cities` on the world map.
# The coordinates are mapped by column name from `data = cities` rather than
# by writing cities$... inside aes(), which ggplot2 discourages.
# The colour is set outside aes() as a plain vector of cluster numbers, so
# points are coloured by cluster without a legend.
mp +
  geom_point(
    data = cities,
    mapping = aes(x = longitude, y = latitude),
    color = cities$cluster
  )
NA